/*******************************************************************************
* Objective: Generate IPUMS industry distribution at country level
*******************************************************************************/

* Pin interpreter behaviour to Stata 16 so later releases run this file the same way
version 16

* Establish working directory *************************************************
* The global macro workdirectory is not defined in this file; it is expected to
* be set by the caller before this do-file runs. All relative paths below are
* resolved against it.
cd "$workdirectory"

* Session settings *************************************************************
* Never pause output, close any log left open (capture ignores the error raised
* when no log is open), and start from an empty session.
set more off
capture log close
clear all

/*******************************************************************************
Step 0: Run the prerequisite do-file. According to the original author, the
dataset it generates is used further down, when the missing urban observations
are appended.
*******************************************************************************/
do "codes/secondary_dos/ipums_urban_shares_data_creator.do"

/*******************************************************************************
Step 1: Importing and combining raw datasets
*******************************************************************************/

* Read every regional IPUMS extract from CSV and store each one in its own
* tempfile. The tempfile macros are named ipumsdata_ followed by the region
* (lac, asia, africa, fiji, png) and are used by the steps that follow.
foreach region in lac asia africa fiji png {
	local csv "raw_datasets/IPUMS_labor/`region'_indgen_all.csv"
	import delimited "`csv'", clear
	tempfile ipumsdata_`region'
	save `ipumsdata_`region'', replace
}

* Stack the LAC, Asia and Africa extracts, in that order.
* Fiji and PNG are appended later, in Step 2.
clear all
use `ipumsdata_lac', clear
append using `ipumsdata_asia' `ipumsdata_africa'

/*******************************************************************************
Step 2: Process Data
*******************************************************************************/

* LAC, Asia and Africa extracts: keep only the weighted number of workers
keep if type_var == "Weighted N"
drop type_var

* Append Fiji and PNG observations. type_var was dropped above, so after the
* append it is empty for the rows already in memory and only the Fiji/PNG rows
* can match the "Column percent" filter.
append using `ipumsdata_fiji'
append using `ipumsdata_png'
drop if type_var == "Column percent"
drop type_var

* Generating country and year variable
* Rows whose census label is "COL TOTAL" are totals, not a country-year sample.
drop if census == "COL TOTAL"

* Country = the first run of space-separated alphabetic words in census.
* The previous pattern captured at most two words, which truncated longer names
* (for example "Papua New Guinea" became "Papua New"). One- and two-word names
* are extracted exactly as before.
* NOTE(review): names containing hyphens, apostrophes or accented letters are
* still cut at the first such character - confirm none occur in the extracts.
gen country = regexs(0) if regexm(census, "[a-zA-Z]+([ ]+[a-zA-Z]+)*")

* Year = the four digits that end the census label; stored as text first and
* then converted to numeric by destring.
gen year = regexs(0) if(regexm(census, "[0-9][0-9][0-9][0-9]$"))
drop census
destring year, replace

* Append missing urban observations
* The path uses a forward slash: Stata accepts it as a directory separator on
* every platform, whereas a backslash is Windows-specific and is also the
* character Stata uses to escape macro expansion. This matches the other
* relative paths in this file.
append using "processed_datasets/urban_shares_countries_indgen_all"
	
* Creating Shares
* Every share is 100 * (industry count) / (row total minus the
* not-in-universe count). The denominator is computed once and held in double
* precision, which is the precision Stata uses when it evaluates the
* expression inline, so the resulting shares are unchanged.
tempvar in_universe
quietly gen double `in_universe' = rowtotal - niunotinuniverse

* Parallel lists: the n-th share variable is built from the n-th source column
local share_vars ///
	agri mining mfg utilities construct trade hospitality transport ///
	finance govmt unspec_service buss_service educ health ///
	other_services household_serv other_industry supressed unknowns

local source_vars ///
	agriculturefishingandforestry miningandextraction manufacturing ///
	electricitygaswaterandwastemanag construction wholesaleandretailtrade ///
	hotelsandrestaurants transportationstorageandcommunic ///
	financialservicesandinsurance publicadministrationanddefense ///
	servicesnotspecified businessservicesandrealestate education ///
	healthandsocialwork otherservices privatehouseholdservices ///
	otherindustrynec responsesuppressed unknown

local n_shares : word count `share_vars'
forvalues i = 1/`n_shares' {
	local share  : word `i' of `share_vars'
	local source : word `i' of `source_vars'
	gen `share' = (`source'/`in_universe')*100
}

* The helper denominator is not part of the output
drop `in_universe'

* Variable labels: industry code followed by the industry name
label variable agri           "10 Agriculture, fishing, and forestry"
label variable mining         "20 Mining and extraction"
label variable mfg            "30 Manufacturing"
label variable utilities      "40 Electricity, gas, water and waste management"
label variable construct      "50 Construction"
label variable trade          "60 Wholesale and retail trade"
label variable hospitality    "70 Hotels and restaurants"
label variable transport      "80 Transportation, storage, and communications"
label variable finance        "90 Financial services and insurance"
label variable govmt          "100 Public administration and defense"
label variable unspec_service "110 Services, not specified"
label variable buss_service   "111 Business services and real estate"
label variable educ           "112 Education"
label variable health         "113 Health and social work"
label variable other_services "114 Other services"
label variable household_serv "120 Private household services"
label variable other_industry "130 Other industry, n.e.c."
label variable supressed      "998 Response suppressed"
label variable unknowns       "999 Unknown"

* Keep relevant variables: the industry shares plus the identifying columns
keep agri mining mfg utilities construct trade hospitality transport ///
	finance govmt unspec_service buss_service educ health other_services ///
	household_serv other_industry supressed unknowns ///
	country year type_share year_share

* Drop countries for compatibility with the rest of the datasets
drop if inlist(country, "Armenia", "Palestine", "Saint Lucia", "Israel", ///
	"Kyrgyz Republic", "South Sudan")

* Rename the remaining countries for compatibility with the rest of the datasets
replace country = "Venezuela, RB"      if country == "Venezuela"
replace country = "Iran, Islamic Rep." if country == "Iran"
replace country = "Egypt, Arab Rep."   if country == "Egypt"
replace country = "Lao PDR"            if country == "Laos"
replace country = "Papua New Guinea"   if country == "Papua New"

* Save the processed industry-share dataset, overwriting any existing file.
* The path uses a forward slash: Stata accepts it as a directory separator on
* every platform, whereas a backslash is Windows-specific and is also the
* character Stata uses to escape macro expansion.
save "processed_datasets/dataset_industry_ipums_all(09.16.2021).dta", replace
